#!/usr/bin/python
# coding=utf-8

# 20180804
# author：葛木瓜
# 从西安房管局意向登记平台爬取已完成的登记信息
# 长安区登记平台
# http://124.115.228.93/zfrgdjpt/xmgsca.aspx
# 非长安区登记平台
# http://124.115.228.93/zfrgdjpt/xmgs.aspx


# from autoGetRegisDatas.writeCsv import write_csv
import re
import sys
from urllib import request
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class GetRegisData:
    """Fetch one page of the housing-registration platform and extract data.

    The page is downloaded and parsed once in ``__init__``; the other methods
    only query the parsed document stored in ``self.soup``.

    Per the original author's notes, the information of interest is:
    1. Upcoming projects: project count, project name, total units.
    2. Projects currently registering: project count, project name, total
       units, current registration count, time left until the deadline.
    """

    # Patterns are compiled once for the class instead of once per call/item.
    _PAGING_RE = re.compile('page')
    _TOTAL_RE = re.compile("全部可售房源")
    _PERIOD_RE = re.compile("至 ")
    _REMAINING_RE = re.compile('还剩')
    _DIGITS_RE = re.compile(r'([0-9]+)')

    def __init__(self, url):
        """Download ``url`` and parse it with BeautifulSoup.

        :param url: address of the listing page to scrape
        :raises SystemExit: with status 1 if the page cannot be fetched or
            decoded as UTF-8 (the error is printed first)
        """
        self.url = url
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0"
        }
        self.req = request.Request(self.url, headers=self.header)
        try:
            # The context manager closes the HTTP response even if reading
            # or decoding fails.
            with request.urlopen(self.req) as response:
                html = response.read().decode('utf-8')
        except Exception as e:
            print('Connection is reset: %s' % e)
            # Non-zero status so that callers/shells can detect the failure.
            sys.exit(1)
        else:
            self.soup = BeautifulSoup(html, 'html.parser')

    @staticmethod
    def _natural_key(text):
        """Return a sort key that orders embedded digit runs numerically."""
        tokens = GetRegisData._DIGITS_RE.split(text)
        # With a capturing group, odd positions are always the digit runs, so
        # keys of different strings stay comparable position by position.
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    def get_page_url(self):
        """Collect the URLs of all result pages.

        :return: list starting from ``self.url`` plus every pagination link
            found on the page, ordered so that page 2 precedes page 10; just
            ``[self.url]`` when the page has no pagination links
        """
        page_url = [self.url]
        paging = self.soup.find_all(href=self._PAGING_RE)
        # The last matching link is deliberately skipped, as in the original
        # code -- presumably a "next"/"last" navigation link that repeats an
        # earlier page; TODO confirm against the live markup.
        for link in paging[:-1]:
            page_url.append(urljoin(self.url, link['href']))
        page_url.sort(key=self._natural_key)
        return page_url

    def get_num(self, status):
        """Return the count shown in parentheses next to a status label.

        Only the first four ``<span>`` elements of the page are inspected.

        :param status: label text to look for; the original notes list
            全部 / 正在登记 / 暂未开始 / 登记结束
        :return: the text between ``(`` and ``)`` of the first matching span
            that contains a parenthesised part, or ``None`` if there is none
        """
        for span in self.soup.find_all('span', limit=4):
            text = span.text
            if status not in text:
                continue
            parts = text.split('(')
            if len(parts) < 2:
                # Label present but without a "(n)" part: keep looking
                # instead of failing with IndexError.
                continue
            return parts[1].split(')')[0]
        return None

    def get_data(self):
        """Extract the details of every project listed on the page.

        :return: dict mapping project name to a list of plain ``str`` values:
            ``[total units, online registration period, on-site document
            acceptance period, on-site document acceptance location]`` plus,
            when the page shows it, the remaining time as a fifth element
        :raises IndexError: if a project entry lacks one of the mandatory
            fields
        :raises AttributeError: if a project entry lacks the nested
            ``div > div > span`` that holds its name
        """
        xm_dict = {}
        for item in self.soup.find_all(class_="xmgsItem"):
            name = item.div.div.span.text.strip(' ')  # project name
            total_text = item.find_all('span', title=self._TOTAL_RE)[0].get_text()
            # One search yields both periods: [0] online registration,
            # [1] on-site document acceptance.
            periods = item.find_all(string=self._PERIOD_RE)
            place = item.find_all(
                style='word-break:break-all;word-wrap:break-word;')[0].get_text()
            # str() detaches the values from the parse tree; BeautifulSoup
            # string nodes otherwise keep the whole document alive.
            row = [
                total_text.split('共')[1].split(")")[0],  # total units
                str(periods[0]),
                str(periods[1]),
                place,
            ]
            remaining = item.find_all(string=self._REMAINING_RE)
            if remaining:
                row.append(str(remaining[0]))  # time left
            xm_dict[name] = row
        return xm_dict

